Part 1: RNA

Load RNA samples

Out of 30 samples, we selected 17 for this study. These are the normal tissue samples form the control, the UVA and the UVA+SFN treatment groups. normal tissue samples from the UVB_UA groups as well as tumor samples were excluded from this analysis. Additionally, one of the control samples at Week 2 (baseline) was removed after outlier analysis.
7,219 genes with zero counts in > 80% (> 13 out of 18) of samples were removed. 17,202 out of 24,421 genes were left.

[1] 7219
[1] 17202

Transcripts per kilobase million (TPM) normalization

Next, we noramized the counts. To convert number of hits to the relative abundane of genes in each sample, we used transcripts per kilobase million (TPM) normalization, which is as following for the j-th sample:
1. normilize for gene length: a[i, j] = 1,000*count[i, j]/gene[i, j] length(bp)
2. normalize for seq depth (i.e. total count): a(i, j)/sum(a[, j])
3. multiply by one million
A very good comparison of normalization techniques can be found at the following video:
RPKM, FPKM and TPM, clearly explained

After the normalization, each sample’s total is 1M:

02w_CON_0 02w_SFN_0 02w_SFN_1 02w_UVB_0 02w_UVB_1 15w_CON_0 15w_CON_1 15w_SFN_0 
    1e+06     1e+06     1e+06     1e+06     1e+06     1e+06     1e+06     1e+06 
15w_SFN_1 15w_UVB_0 15w_UVB_1 25w_CON_0 25w_CON_1 25w_SFN_0 25w_SFN_1 25w_UVB_0 
    1e+06     1e+06     1e+06     1e+06     1e+06     1e+06     1e+06     1e+06 
25w_UVB_1 
    1e+06 

Top 100 most abundant RNA molecules

# Separate top 100 abundant genes
tmp <- droplevels(tpm[Geneid %in% levels(tpm$Geneid)[(nrow(tpm) - 99):nrow(tpm)]])

tmp <- melt.data.table(data = tmp,
                       id.vars = 1:2,
                       measure.vars = 3:ncol(tmp),
                       variable.name = "Sample",
                       value.name = "TPM")

tmp$Week <- substr(x = tmp$Sample,
                   start = 1,
                   stop = 3)
tmp$Week <- factor(tmp$Week,
                   levels = unique(tmp$Week))


tmp$Treatment <- substr(x = tmp$Sample,
                        start = 5,
                        stop = 7)
tmp$Treatment <- factor(tmp$Treatment,
                        levels = c("CON", 
                                   "UVB",
                                   "SFN"))

tmp$Replica <- substr(x = tmp$Sample,
                      start = 9,
                      stop = 9)
tmp$Replica <- factor(tmp$Replica,
                      levels = 0:1)

# Plot top 100 abundant genes
p2 <- ggplot(tmp,
             aes(x = TPM,
                 y = Geneid,
                 fill = Treatment,
                 shape = Week)) +
  # facet_wrap(~ Sex, nrow = 1) +
  geom_point(size = 3,
             alpha = 0.5) +
  geom_vline(xintercept = 1,
             linetype = "dashed")
ggplotly(p2)

Bottom 100 least abundant RNA molecules

tmp <- droplevels(tpm[Geneid %in% levels(tpm$Geneid)[1:100]])

tmp <- melt.data.table(data = tmp,
                       id.vars = 1:2,
                       measure.vars = 3:ncol(tmp),
                       variable.name = "Sample",
                       value.name = "TPM")

tmp$Week <- substr(x = tmp$Sample,
                   start = 1,
                   stop = 3)
tmp$Week <- factor(tmp$Week,
                   levels = unique(tmp$Week))


tmp$Treatment <- substr(x = tmp$Sample,
                        start = 5,
                        stop = 7)
tmp$Treatment <- factor(tmp$Treatment,
                        levels = c("CON", 
                                   "UVB",
                                   "SFN"))

tmp$Replica <- substr(x = tmp$Sample,
                      start = 9,
                      stop = 9)
tmp$Replica <- factor(tmp$Replica,
                      levels = 0:1)

# Plot top 100 abundant genes
p3 <- ggplot(tmp,
             aes(x = TPM,
                 y = Geneid,
                 fill = Treatment,
                 shape = Week)) +
  # facet_wrap(~ Sex, nrow = 1) +
  geom_point(size = 3,
             alpha = 0.5) +
  geom_vline(xintercept = 1,
             linetype = "dashed")
ggplotly(p3)

Meta data

dmeta <- data.table(Sample = colnames(dt1)[-c(1:2)])

dmeta$time <- substr(x = dmeta$Sample,
                     start = 1,
                     stop = 3)
dmeta$time <- factor(dmeta$time,
                     levels = c("02w",
                                "15w",
                                "25w"))
dmeta$Week <- factor(dmeta$time,
                     levels = c("02w",
                                "15w",
                                "25w"),
                     labels = c("Week 2",
                                "Week 15",
                                "Week 25"))

dmeta$trt <- substr(x = dmeta$Sample,
                    start = 5,
                    stop = 7)
dmeta$trt <- factor(dmeta$trt,
                    levels = c("CON", 
                               "UVB",
                               "SFN"))
dmeta$Treatment <- factor(dmeta$trt,
                          levels = c("CON", 
                                     "UVB",
                                     "SFN"),
                          labels = c("Negative Control",
                                     "Positive Control (UVB)",
                                     "Sulforaphane (SFN)"))

dmeta$Replica <- substr(x = dmeta$Sample,
                        start = 9,
                        stop = 9)
dmeta$Replica <- factor(dmeta$Replica,
                        levels = 0:1)

datatable(dmeta,
          options = list(pageLength = nrow(dmeta)))

PCA of TPM

NOTE: the distributions are skewed. To make them symmetric, log transformation is often applied. However, there is an issue of zeros. In this instance, we added a small values lambda[i] equal to 1/10 of the smallest non-zero value of i-th gene.

dm.tpm <- as.matrix(tpm[, -c(1:2), with = FALSE])
rownames(dm.tpm) <- tpm$Geneid

# # Remove 02w_CON_1 sample and redo PCA
# dm.tpm <- dm.tpm[, colnames(dm.tpm) != "02w_CON_1"]
# dmeta <- dmeta[dmeta$Sample != "02w_CON_1", ]

# Add lambdas to all values, then take a log
dm.ltpm <- t(apply(X = dm.tpm,
                      MARGIN = 1,
                      FUN = function(a) {
                        lambda <- min(a[a > 0])/10
                        log(a + lambda)
                      }))

# PCA----
m1 <- prcomp(t(dm.ltpm),
             center = TRUE,
             scale. = TRUE)

s1 <- summary(m1)
s1
Importance of components:
                           PC1     PC2     PC3      PC4      PC5      PC6      PC7
Standard deviation     66.5041 61.8206 45.2845 30.42909 28.24422 26.84136 25.01865
Proportion of Variance  0.2571  0.2222  0.1192  0.05383  0.04637  0.04188  0.03639
Cumulative Proportion   0.2571  0.4793  0.5985  0.65232  0.69869  0.74058  0.77696
                            PC8      PC9     PC10     PC11    PC12     PC13
Standard deviation     23.05989 22.08373 21.24391 20.87624 20.6980 20.28169
Proportion of Variance  0.03091  0.02835  0.02624  0.02534  0.0249  0.02391
Cumulative Proportion   0.80788  0.83623  0.86246  0.88780  0.9127  0.93662
                           PC14     PC15     PC16      PC17
Standard deviation     19.42403 19.14803 18.61200 2.085e-13
Proportion of Variance  0.02193  0.02131  0.02014 0.000e+00
Cumulative Proportion   0.95855  0.97986  1.00000 1.000e+00

Pareto chart of variance explained by principal components

imp <- data.table(PC = colnames(s1$importance),
                  Variance = 100*s1$importance[2, ],
                  Cumulative = 100*s1$importance[3, ])
imp$PC <- factor(imp$PC,
                 levels = imp$PC)
p1 <- ggplot(imp,
             aes(x = PC,
                 y = Variance)) +
  geom_bar(stat = "identity",
           fill = "grey",
           color = "black") +
  geom_line(aes(y = rescale(Cumulative,
                            to = c(min(Cumulative)*30/100,
                                   30)),
                group = rep(1, nrow(imp)))) +
  geom_point(aes(y = rescale(Cumulative,
                             to = c(min(Cumulative)*30/100,
                                    30)))) +
  scale_y_continuous("% Variance Explained",
                     breaks = seq(0, 30, by = 5),
                     labels = paste(seq(0, 30, by = 5),
                                    "%",
                                    sep = ""),
                     sec.axis = sec_axis(trans = ~.,
                                         name = "% Cumulative Variance",
                                         breaks = seq(0, 30, length.out = 5),
                                         labels = paste(seq(0, 100, length.out = 5),
                                                        "%",
                                                        sep = ""))) +
  scale_x_discrete("") +
  theme(axis.text.x = element_text(angle = 90,
                                   hjust = 1))
p1

First 3 principal components, pairwise

# Biplot while keep only the most important variables (Javier)----
# Select PC-s to pliot (PC1 & PC2)
choices <- c(1:3)

# Scores, i.e. points (df.u)
dt.scr <- data.table(m1$x[, choices])
# Add grouping variables
dt.scr$trt <- dmeta$trt
dt.scr$time <- dmeta$time
dt.scr$sample <- dmeta$Sample

# Loadings, i.e. arrows (df.v)
dt.rot <- as.data.frame(m1$rotation[, choices])
dt.rot$feat <- rownames(dt.rot)
dt.rot <- data.table(dt.rot)

# Axis labels
u.axis.labs <- paste(colnames(dt.rot)[choices], 
                     sprintf('(%0.1f%% explained var.)', 
                             100*m1$sdev[choices]^2/sum(m1$sdev^2)))

p1 <- ggplot(data = dt.scr,
             aes(x = PC1,
                 y = PC2,
                 color = trt,
                 shape = time)) +
  geom_point(size = 4,
             alpha = 0.5) +
  scale_x_continuous(u.axis.labs[1]) +
  scale_y_continuous(u.axis.labs[2])
ggplotly(p1)


p1 <- ggplot(data = dt.scr,
             aes(x = PC1,
                 y = PC3,
                 color = trt,
                 shape = time)) +
  geom_point(size = 4,
             alpha = 0.5) +
  scale_x_continuous(u.axis.labs[1]) +
  scale_y_continuous(u.axis.labs[3])
ggplotly(p1)


p1 <- ggplot(data = dt.scr,
             aes(x = PC2,
                 y = PC3,
                 color = trt,
                 shape = time)) +
  geom_point(size = 4,
             alpha = 0.5) +
  scale_x_continuous(u.axis.labs[2]) +
  scale_y_continuous(u.axis.labs[3])
ggplotly(p1)

First 3 principal components, 3D

scatterplot3js(x = dt.scr$PC1, 
               y = dt.scr$PC2, 
               z = dt.scr$PC3, 
               color = as.numeric(dt.scr$trt),
               renderer = "auto",
               pch = dt.scr$sample,
               size = 0.1)

Differential expression analysis (DESeq2 pipeline)

Sources:
1. Analyzing RNA-seq data with DESeq2:Interactions
2. Bioconductor Question: DESeq2 time series analysis
We are testing a model with time*treatment interaction. The idea here is to find genes with significant interaction term. That would suggest that the gene expressiondifferences between the treatments depended on time. THere are several possible scenarios:
a. No difference between the negative control and the positive control groups at baseline, significant difference at the later time point. This will show the effect of the disease (UVB radiation, in this case).
b. Significant difference between the control groups at baseline, no difference at the later time point. Same as (a) above.
c. Differences between the positive control and the SFN-treated groups. Here, we are interested in the reversal of UVB effect. Again, the interaction term will need to be significant for the reasons described above.

First, we will compare weeks 2 and 15. Previous analyses showed that majority of cahnges happend at by this time. far less genes were differentially expressed between weeks 15 and 25.

# Baseline vs. Week 15 only
dmeta.15 <- droplevels(dmeta[Week != "Week 25", ])

# Relevel: make all comparisons with the positive control (UVB)
dmeta.15$trt <- factor(dmeta.15$trt,
                       levels = c("UVB",
                                  "CON",
                                  "SFN"))

dtm.15 <- as.matrix(dt1[, dmeta.15$Sample,
                        with = FALSE])
rownames(dtm.15) <- dt1$Geneid

dds <- DESeqDataSetFromMatrix(countData = dtm.15, 
                              colData = dmeta.15,
                              ~ time + trt + time:trt)
# If all samples contain zeros, geometric means cannot be
# estimated. Change default 'type = "ratio"' to 'type = "poscounts"'.
# Type '?DESeq2::estimateSizeFactors' for more details.
dds <- estimateSizeFactors(object = dds,
                           type = "poscounts")

# Run DESeq----
dds <- DESeq(object = dds,
             # test = "LRT",
             # reduced = ~ time + trt,
             fitType = "local",
             sfType = "ratio",
             parallel = FALSE)
using pre-existing size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
# NOTE (from DESeq help file, section Value):
# A DESeqDataSet object with results stored as metadata columns. 
# These results should accessed by calling the results function. 
# By default this will return the log2 fold changes and p-values
# for the last variable in the design formula. 
# See results for how to access results for other variables.
# In this case, the last term is the interaction term trt:time

resultsNames(dds)
[1] "Intercept"       "time_15w_vs_02w" "trt_CON_vs_UVB"  "trt_SFN_vs_UVB" 
[5] "time15w.trtCON"  "time15w.trtSFN" 
# model.matrix(~ time + trt + time:trt, dmeta.15)

Results: interactions terms

Tests if the effect of NOT treating with UVB vs. treating with UVB is different at Week 15 compared to Week 2:

res_int_con_uvb_week <- results(dds, 
                                name = "time15w.trtCON",
                                alpha = 0.1)
res_int_con_uvb_week <- res_int_con_uvb_week[order(res_int_con_uvb_week$padj,
                                                   decreasing = FALSE),]
print(res_int_con_uvb_week)
log2 fold change (MLE): time15w.trtCON 
Wald test p-value: time15w.trtCON 
DataFrame with 17202 rows and 6 columns
                 baseMean     log2FoldChange             lfcSE               stat
                <numeric>          <numeric>         <numeric>          <numeric>
Ces2g    1212.11866446455    1.6619688818668 0.234190186405423   7.09666321794395
Zbed6    374.510215344094   1.96733587666412 0.292968839962875   6.71517106363058
Arl5b    632.153990622569   1.38645758140518 0.226457985775024   6.12236118174503
Sprr2a3  512.346362949067  -1.76586374119559 0.290687670719661  -6.07478031945357
Chil4    418.261741611881  -11.1234731694243  1.82545268713692  -6.09354230203062
...                   ...                ...               ...                ...
Gpm6b    5.48672624992949  -3.26990311618554  1.68534439128271  -1.94019877070753
Tlr7    0.383215335528282  0.218975091076327  5.66317890387017  0.038666461857082
Arhgap6  1.15106272316865  0.698916321397709  3.06886540243518  0.227744208280726
Spry3    2.98916938673755 -0.762318608018375  2.34004432231902 -0.325771012432322
Zf12    0.196069987203268   1.54171624416706   8.1299002488468  0.189635321095821
                      pvalue                 padj
                   <numeric>            <numeric>
Ces2g   1.27804989896622e-12 1.68638684168593e-08
Zbed6   1.87845717757675e-11 1.23931212290626e-07
Arl5b   9.21987334038214e-10 3.27651831780399e-06
Sprr2a3 1.24157571724289e-09 3.27651831780399e-06
Chil4   1.10439133480463e-09 3.27651831780399e-06
...                      ...                  ...
Gpm6b     0.0523555378668028                   NA
Tlr7       0.969156312963247                   NA
Arhgap6    0.819845102163337                   NA
Spry3      0.744597612065556                   NA
Zf12       0.849594907168991                   NA
summary(res_int_con_uvb_week)

out of 17196 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up)       : 68, 0.4%
LFC < 0 (down)     : 93, 0.54%
outliers [1]       : 0, 0%
low counts [2]     : 4001, 23%
(mean count < 6)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
# How many adjusted p-values were less than 0.05?
sum(res_int_con_uvb_week$padj < 0.1, 
    na.rm = TRUE)
[1] 161
# MA plot
print(plotMA(res_int_con_uvb_week))
NULL

Tests if the effect of treating with UVB+SFN vs. treating with UVB is different at Week 15 compared to Week 2:

res_int_sfn_uvb_week <- results(dds, 
                                name = "time15w.trtSFN",
                                alpha = 0.1)
res_int_sfn_uvb_week <- res_int_sfn_uvb_week[order(res_int_sfn_uvb_week$padj,
                                                   decreasing = FALSE),]
print(res_int_sfn_uvb_week)
log2 fold change (MLE): time15w.trtSFN 
Wald test p-value: time15w.trtSFN 
DataFrame with 17202 rows and 6 columns
                  baseMean     log2FoldChange             lfcSE               stat
                 <numeric>          <numeric>         <numeric>          <numeric>
Sprr2i    126.386323195034   2.84449817547931 0.472503858478252   6.02005279838415
Rabgap1l  987.854277497041  0.888170477648323 0.161280261780297   5.50700047137963
Lipg      411.991958764794     1.485142042575 0.270981126568133   5.48061062917455
Jakmip2     60.28009868717   3.21405954537184 0.601663275617683   5.34195732999027
Ankrd37   245.126852085138   1.70136669174856 0.356386435019607   4.77393785107156
...                    ...                ...               ...                ...
Asb11     1.43414827064918   1.28619798291892  2.67139832894984  0.481469936168051
Tlr7     0.383215335528282  -1.58707535511162  5.05399583107809 -0.314023875000521
Arhgap6   1.15106272316865  0.426895223174219  2.73796975991385  0.155916704933823
Spry3     2.98916938673755 -0.716569188800694  1.91268955607796 -0.374639567892055
Zf12     0.196069987203268  -1.67955051516172  7.26863294062649 -0.231068280498004
                       pvalue                 padj
                    <numeric>            <numeric>
Sprr2i   1.74360186971955e-09 2.41698091180524e-05
Rabgap1l 3.64999283732504e-08 0.000195851753101377
Lipg     4.23860380395421e-08 0.000195851753101377
Jakmip2   9.1948301681653e-08 0.000318646839477769
Ankrd37  1.80658153241744e-06  0.00500856664047412
...                       ...                  ...
Asb11       0.630182541814492                   NA
Tlr7        0.753502905668786                   NA
Arhgap6     0.876098677711789                   NA
Spry3       0.707928541795799                   NA
Zf12        0.817261756825782                   NA
summary(res_int_sfn_uvb_week)

out of 17196 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up)       : 14, 0.081%
LFC < 0 (down)     : 5, 0.029%
outliers [1]       : 0, 0%
low counts [2]     : 3334, 19%
(mean count < 3)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
# How many adjusted p-values were less than 0.05?
sum(res_int_sfn_uvb_week$padj < 0.1, 
    na.rm = TRUE)
[1] 19
# MA plot
print(plotMA(res_int_sfn_uvb_week))
NULL

# NOTE: same as 
# res <- results(dds, 
#                   alpha = 0.05)
# res <- res[order(res$padj, decreasing = FALSE),]
# res

NOTE: By default, the results(dds)* prints the results for the last level of the last term, i.e. here it was for for the interaction term SFN vs. UVB at Week 15 vs. Week 2.

Genes with both interactions being significant

lgene.con <- unique(res_int_con_uvb_week@rownames[res_int_con_uvb_week$padj < 0.1])
lgene.sfn <- unique(res_int_sfn_uvb_week@rownames[res_int_sfn_uvb_week$padj < 0.1])
lgene <- lgene.con[lgene.con %in% lgene.sfn]
lgene <- lgene[!is.na(lgene)]
lgene
[1] "Samhd1"   "Pfkfb3"   "Rabgap1l" "Prss53"   "Dct"      "Jakmip2"  "Lipg"    

Plot of DESeq-normalizedcounts of genes with smallest adjusted p-value for the interaction term:

# Get the DESeq-normalize counts
dp1 <- list()
for (i in 1:length(lgene)) {
  out <- plotCounts(dds, 
                    gene = lgene[[i]],
                    intgroup = c("trt",
                                 "time"),
                    returnData = TRUE)
  dp1[[i]] <- data.table(Geneid = lgene[[i]],
                         Sample = rownames(out),
                         out)
}
dp1 <- rbindlist(dp1)
dp1$trt <- factor(dp1$trt,
                  levels = c("CON",
                             "UVB",
                             "SFN"))
dp1$time <- factor(dp1$time,
                   levels = c("02w",
                              "15w"),
                   labels = c("Week 2",
                              "Week 15"))
dp1$Geneid <- factor(dp1$Geneid,
                     levels = lgene)
dp1[, mu := mean(count,
                 na.rm = TRUE),
    by = c("Geneid",
           "trt",
           "time")]
dmu <- unique(dp1[, -c("Sample",
                       "count")])

p1 <- ggplot(dp1,
             aes(x = time,
                 y = count,
                 group = trt,
                 fill = trt)) +
  facet_wrap(~ Geneid,
             scale = "free_y") +
  geom_point(position = position_dodge(0.5),
             shape = 21,
             size = 5,
             color = "black") +
  geom_line(data = dmu,
            aes(x = time,
                y = mu,
                group = trt,
                colour = trt),
            position = position_dodge(0.5),
            alpha = 0.5,
            size = 2) +
  scale_x_discrete("") +
  scale_y_continuous("DESeq-Normalized Counts") +
  scale_fill_discrete("Treatment")
print(p1)

Compare to the plot of TPM-normalizedcounts of genes with smallest adjusted p-value for the interaction term:

# Examine TPM values for the same genes
tmp <- tpm[Geneid %in% lgene, ]
tmp$Geneid <- factor(tmp$Geneid,
                     levels = lgene)
tmp <- melt.data.table(data = tmp,
                       id.vars = 1,
                       measure.vars = 3:ncol(tmp),
                       variable.name = "Sample",
                       value.name = "TPM")
tmp <- merge(dmeta,
             tmp,
             by = "Sample")

p1 <- ggplot(tmp,
             aes(x = Week,
                 y = TPM,
                 fill = Treatment,
                 group = Treatment)) +
  facet_wrap(~ Geneid,
             scales = "free_y") +
  geom_point(position = position_dodge(0.5),
             shape = 21,
             size = 5,
             color = "black")+
  scale_x_discrete("")
plot(p1)

Session Information

sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1

Matrix products: default

locale:
[1] LC_COLLATE=English_United States.1252  LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets  methods  
[9] base     

other attached packages:
 [1] scales_1.0.0                threejs_0.3.1              
 [3] igraph_1.2.4.1              plotly_4.9.0               
 [5] ggplot2_3.1.1               readxl_1.3.1               
 [7] DESeq2_1.24.0               SummarizedExperiment_1.14.0
 [9] DelayedArray_0.10.0         BiocParallel_1.17.18       
[11] matrixStats_0.54.0          Biobase_2.44.0             
[13] GenomicRanges_1.36.0        GenomeInfoDb_1.20.0        
[15] IRanges_2.18.1              S4Vectors_0.22.0           
[17] BiocGenerics_0.30.0         DT_0.6                     
[19] data.table_1.12.2           knitr_1.22                 

loaded via a namespace (and not attached):
 [1] httr_1.4.0             tidyr_0.8.3            viridisLite_0.3.0     
 [4] jsonlite_1.6           bit64_0.9-7            splines_3.6.0         
 [7] shiny_1.3.2            Formula_1.2-3          assertthat_0.2.1      
[10] latticeExtra_0.6-28    blob_1.1.1             GenomeInfoDbData_1.2.1
[13] cellranger_1.1.0       yaml_2.2.0             pillar_1.4.1          
[16] RSQLite_2.1.1          backports_1.1.4        lattice_0.20-38       
[19] glue_1.3.1             digest_0.6.19          promises_1.0.1        
[22] RColorBrewer_1.1-2     XVector_0.24.0         checkmate_1.9.3       
[25] colorspace_1.4-1       httpuv_1.5.1           htmltools_0.3.6       
[28] Matrix_1.2-17          plyr_1.8.4             XML_3.98-1.19         
[31] pkgconfig_2.0.2        genefilter_1.66.0      zlibbioc_1.30.0       
[34] purrr_0.3.2            xtable_1.8-4           later_0.8.0           
[37] htmlTable_1.13.1       tibble_2.1.2           annotate_1.62.0       
[40] withr_2.1.2            nnet_7.3-12            lazyeval_0.2.2        
[43] mime_0.6               survival_2.44-1.1      magrittr_1.5          
[46] crayon_1.3.4           memoise_1.1.0          foreign_0.8-71        
[49] tools_3.6.0            stringr_1.4.0          munsell_0.5.0         
[52] locfit_1.5-9.1         cluster_2.0.9          AnnotationDbi_1.46.0  
[55] compiler_3.6.0         rlang_0.3.4            grid_3.6.0            
[58] RCurl_1.95-4.12        rstudioapi_0.10        htmlwidgets_1.3       
[61] crosstalk_1.0.0        labeling_0.3           bitops_1.0-6          
[64] base64enc_0.1-3        gtable_0.3.0           DBI_1.0.0             
[67] R6_2.4.0               gridExtra_2.3          dplyr_0.8.1           
[70] bit_1.1-14             Hmisc_4.2-0            stringi_1.4.3         
[73] Rcpp_1.0.1             geneplotter_1.62.0     rpart_4.1-15          
[76] acepack_1.4.1          tidyselect_0.2.5       xfun_0.7              
---
title: "Skin UVB SKH1 mouse model treated with SFN "
output:
  html_notebook:
    toc: yes
    toc_float: yes
    code_folding: hide
---

# Part 1: RNA
```{r header, echo = FALSE, message = FALSE, error = FALSE, warning  =FALSE}
# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# BiocManager::install("DESeq2")

require(knitr)
require(data.table)
require(DT)
require(DESeq2)
require(readxl)
require(BiocParallel)
require(ggplot2)
require(plotly)
require(threejs)
require(scales)

# NOTE: on DESeq2 Output: 'baseMean' is the average of the normalized count values, 
# divided by the size factors, taken over all samples in the DESeqDataSet
```

## Load RNA samples
Out of 30 samples, we selected 17 for this study. These are the normal tissue samples form the control, the UVA and the UVA+SFN treatment groups. normal tissue samples from the UVB_UA groups as well as tumor samples were excluded from this analysis. Additionally, one of the control samples at Week 2 (baseline) was removed after outlier analysis.    
7,219 genes with zero counts in > 80% (> 13 out of 18) of samples were removed. 17,202 out of 24,421 genes were left. 
         
```{r data_rna, warning = FALSE, echo = FALSE, message = FALSE}
# Load data----
dt0 <- fread("data/renyi_dedup_rnaseq_data/featurescounts_uvb-skin_dedup_renyi_2-9-2018.csv",
             skip = 1)

# Remove unused columns----
dt1 <- dt0[, c(1, 6:ncol(dt0)), with = FALSE]

cnames <- colnames(dt1)[-c(1:2)]
cnames <- gsub(x = cnames,
               pattern = ".dedup.bam",
               replacement = "")
colnames(dt1)[-c(1:2)] <- cnames

# ATTENTION! In this analysis, we will only examine controls and SFN
# Also, removed cancer cell samples
tnames <- substr(x = colnames(dt1), 
                 start = 3,
                 stop = 3)

gnames <- substr(x = colnames(dt1), 
                 start = 5,
                 stop = 7)

dt1 <- dt1[, gnames %in% c("id",
                           "th",
                           "CON",
                           "UVB",
                           "SFN" ) &
             tnames != "t",
           with = FALSE]
# 18 samples left

# Remove sample '02w_CON_1' as an outlier
# See 'skin_uvb_sfn_exclude_con2w1_v1' for details
dt1 <- dt1[, colnames(dt1) != "02w_CON_1", with = FALSE]

# Remove genes with zero counts in > 80% (> 13 out of 17) of samples
tmp <- dt1[, -c(1:2)] == 0
tmp <- rowSums(tmp) > 13
sum(tmp)

dt1 <- droplevels(dt1[!tmp, ])
nrow(dt1)
# 17,202 out of 24,421 genes left

datatable(head(dt1, 10),
          rownames = FALSE,
          options = list(pageLength = 10),
          caption = "Table 1: first 10 rows of the count table")
```

## Transcripts per kilobase million (TPM) normalization
Next, we noramized the counts. To convert number of hits to  the relative abundane of genes in each sample, we used ***transcripts per kilobase million (TPM)*** normalization, which is as following for the j-th sample:       
1. normilize for gene length: a[i, j] = 1,000*count[i, j]/gene[i, j] length(bp)     
2. normalize for seq depth (i.e. total count): a(i, j)/sum(a[, j])     
3. multiply by one million     
A very good comparison of normalization techniques can be found at the following video:    
[RPKM, FPKM and TPM, clearly explained](https://www.rna-seqblog.com/rpkm-fpkm-and-tpm-clearly-explained/)
     
After the normalization, each sample's total is 1M:
     
```{r tpm, warning = FALSE, echo = FALSE, message = FALSE}
# Normalize counts to TPM
tmp <- 1000*dt1[, 3:ncol(dt1)]/dt1$Length
tpm <- data.table(Geneid = dt1$Geneid,
                  Length = dt1$Length,
                  apply(tmp,
                        2,
                        function(a) {
                          10^6*(a/sum(a))
                        }))
colSums(tpm[, -c(1:2)])

formatRound(datatable(head(tpm, 10),
                      rownames = FALSE,
                      options = list(pageLength = 10),
                      caption = "Table 2: transcripts per kilobase million (TPM) normalized counts"),
            columns = 3:ncol(tpm),
            digits = 2)

# Total TPM
total <- rowSums(tpm[, 3:ncol(tpm)])

# Sort genes by relative abundancy
tpm$Geneid <- factor(tpm$Geneid ,
                     levels = tpm$Geneid[order(total,
                                               decreasing = FALSE)])
```

# Top 100 most abundant RNA molecules
```{r most_abundant}
# Separate top 100 abundant genes
tmp <- droplevels(tpm[Geneid %in% levels(tpm$Geneid)[(nrow(tpm) - 99):nrow(tpm)]])

tmp <- melt.data.table(data = tmp,
                       id.vars = 1:2,
                       measure.vars = 3:ncol(tmp),
                       variable.name = "Sample",
                       value.name = "TPM")

tmp$Week <- substr(x = tmp$Sample,
                   start = 1,
                   stop = 3)
tmp$Week <- factor(tmp$Week,
                   levels = unique(tmp$Week))


tmp$Treatment <- substr(x = tmp$Sample,
                        start = 5,
                        stop = 7)
tmp$Treatment <- factor(tmp$Treatment,
                        levels = c("CON", 
                                   "UVB",
                                   "SFN"))

tmp$Replica <- substr(x = tmp$Sample,
                      start = 9,
                      stop = 9)
tmp$Replica <- factor(tmp$Replica,
                      levels = 0:1)

# Plot top 100 abundant genes
p2 <- ggplot(tmp,
             aes(x = TPM,
                 y = Geneid,
                 fill = Treatment,
                 shape = Week)) +
  # facet_wrap(~ Sex, nrow = 1) +
  geom_point(size = 3,
             alpha = 0.5) +
  geom_vline(xintercept = 1,
             linetype = "dashed")
ggplotly(p2)
```

# Bottom 100 least abundant RNA molecules
```{r least_abundant}
tmp <- droplevels(tpm[Geneid %in% levels(tpm$Geneid)[1:100]])

tmp <- melt.data.table(data = tmp,
                       id.vars = 1:2,
                       measure.vars = 3:ncol(tmp),
                       variable.name = "Sample",
                       value.name = "TPM")

tmp$Week <- substr(x = tmp$Sample,
                   start = 1,
                   stop = 3)
tmp$Week <- factor(tmp$Week,
                   levels = unique(tmp$Week))


tmp$Treatment <- substr(x = tmp$Sample,
                        start = 5,
                        stop = 7)
tmp$Treatment <- factor(tmp$Treatment,
                        levels = c("CON", 
                                   "UVB",
                                   "SFN"))

tmp$Replica <- substr(x = tmp$Sample,
                      start = 9,
                      stop = 9)
tmp$Replica <- factor(tmp$Replica,
                      levels = 0:1)

# Plot top 100 abundant genes
p3 <- ggplot(tmp,
             aes(x = TPM,
                 y = Geneid,
                 fill = Treatment,
                 shape = Week)) +
  # facet_wrap(~ Sex, nrow = 1) +
  geom_point(size = 3,
             alpha = 0.5) +
  geom_vline(xintercept = 1,
             linetype = "dashed")
ggplotly(p3)
```

# Meta data
```{r meta}
dmeta <- data.table(Sample = colnames(dt1)[-c(1:2)])

dmeta$time <- substr(x = dmeta$Sample,
                     start = 1,
                     stop = 3)
dmeta$time <- factor(dmeta$time,
                     levels = c("02w",
                                "15w",
                                "25w"))
dmeta$Week <- factor(dmeta$time,
                     levels = c("02w",
                                "15w",
                                "25w"),
                     labels = c("Week 2",
                                "Week 15",
                                "Week 25"))

dmeta$trt <- substr(x = dmeta$Sample,
                    start = 5,
                    stop = 7)
dmeta$trt <- factor(dmeta$trt,
                    levels = c("CON", 
                               "UVB",
                               "SFN"))
dmeta$Treatment <- factor(dmeta$trt,
                          levels = c("CON", 
                                     "UVB",
                                     "SFN"),
                          labels = c("Negative Control",
                                     "Positive Control (UVB)",
                                     "Sulforaphane (SFN)"))

dmeta$Replica <- substr(x = dmeta$Sample,
                        start = 9,
                        stop = 9)
dmeta$Replica <- factor(dmeta$Replica,
                        levels = 0:1)

datatable(dmeta,
          options = list(pageLength = nrow(dmeta)))
```

# PCA of TPM
NOTE: the distributions are skewed. To make them symmetric, log transformation is often applied. However, there is an issue of zeros. In this instance, we added a small values ***lambda[i]*** equal to 1/10 of the smallest non-zero value of *i*-th gene. 
```{r pca}
dm.tpm <- as.matrix(tpm[, -c(1:2), with = FALSE])
rownames(dm.tpm) <- tpm$Geneid

# # Remove 02w_CON_1 sample and redo PCA
# dm.tpm <- dm.tpm[, colnames(dm.tpm) != "02w_CON_1"]
# dmeta <- dmeta[dmeta$Sample != "02w_CON_1", ]

# Add lambdas to all values, then take a log
dm.ltpm <- t(apply(X = dm.tpm,
                      MARGIN = 1,
                      FUN = function(a) {
                        lambda <- min(a[a > 0])/10
                        log(a + lambda)
                      }))

# PCA----
m1 <- prcomp(t(dm.ltpm),
             center = TRUE,
             scale. = TRUE)

s1 <- summary(m1)
s1
```

# Pareto chart of variance explained by principal components
```{r pca_var_plot}
imp <- data.table(PC = colnames(s1$importance),
                  Variance = 100*s1$importance[2, ],
                  Cumulative = 100*s1$importance[3, ])
imp$PC <- factor(imp$PC,
                 levels = imp$PC)
p1 <- ggplot(imp,
             aes(x = PC,
                 y = Variance)) +
  geom_bar(stat = "identity",
           fill = "grey",
           color = "black") +
  geom_line(aes(y = rescale(Cumulative,
                            to = c(min(Cumulative)*30/100,
                                   30)),
                group = rep(1, nrow(imp)))) +
  geom_point(aes(y = rescale(Cumulative,
                             to = c(min(Cumulative)*30/100,
                                    30)))) +
  scale_y_continuous("% Variance Explained",
                     breaks = seq(0, 30, by = 5),
                     labels = paste(seq(0, 30, by = 5),
                                    "%",
                                    sep = ""),
                     sec.axis = sec_axis(trans = ~.,
                                         name = "% Cumulative Variance",
                                         breaks = seq(0, 30, length.out = 5),
                                         labels = paste(seq(0, 100, length.out = 5),
                                                        "%",
                                                        sep = ""))) +
  scale_x_discrete("") +
  theme(axis.text.x = element_text(angle = 90,
                                   hjust = 1))
p1
```

# First 3 principal components, pairwise
```{r pca_plots}
# Biplot while keep only the most important variables (Javier)----
# Select PC-s to pliot (PC1 & PC2)
choices <- c(1:3)

# Scores, i.e. points (df.u)
dt.scr <- data.table(m1$x[, choices])
# Add grouping variables
dt.scr$trt <- dmeta$trt
dt.scr$time <- dmeta$time
dt.scr$sample <- dmeta$Sample

# Loadings, i.e. arrows (df.v)
dt.rot <- as.data.frame(m1$rotation[, choices])
dt.rot$feat <- rownames(dt.rot)
dt.rot <- data.table(dt.rot)

# Axis labels
u.axis.labs <- paste(colnames(dt.rot)[choices], 
                     sprintf('(%0.1f%% explained var.)', 
                             100*m1$sdev[choices]^2/sum(m1$sdev^2)))

p1 <- ggplot(data = dt.scr,
             aes(x = PC1,
                 y = PC2,
                 color = trt,
                 shape = time)) +
  geom_point(size = 4,
             alpha = 0.5) +
  scale_x_continuous(u.axis.labs[1]) +
  scale_y_continuous(u.axis.labs[2])
ggplotly(p1)

p1 <- ggplot(data = dt.scr,
             aes(x = PC1,
                 y = PC3,
                 color = trt,
                 shape = time)) +
  geom_point(size = 4,
             alpha = 0.5) +
  scale_x_continuous(u.axis.labs[1]) +
  scale_y_continuous(u.axis.labs[3])
ggplotly(p1)

p1 <- ggplot(data = dt.scr,
             aes(x = PC2,
                 y = PC3,
                 color = trt,
                 shape = time)) +
  geom_point(size = 4,
             alpha = 0.5) +
  scale_x_continuous(u.axis.labs[2]) +
  scale_y_continuous(u.axis.labs[3])
ggplotly(p1)
```

# First 3 principal components, 3D
```{r pca_3d, fig.height = 10, fig.width = 10}
scatterplot3js(x = dt.scr$PC1, 
               y = dt.scr$PC2, 
               z = dt.scr$PC3, 
               color = as.numeric(dt.scr$trt),
               renderer = "auto",
               pch = dt.scr$sample,
               size = 0.1)
```

# Differential expression analysis (DESeq2 pipeline)
Sources:    
1. [Analyzing RNA-seq data with DESeq2:Interactions](https://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#interactions)     
2. [Bioconductor Question: DESeq2 time series analysis](https://support.bioconductor.org/p/97430/)      
We are testing a model with time*treatment interaction. The idea here is to find genes with significant interaction term. That would suggest that the gene expressiondifferences between the treatments depended on time. THere are several possible scenarios:    
a. No difference between the negative control and the positive control groups at baseline, significant difference at the later time point. This will show the effect of the disease (UVB radiation, in this case).     
b. Significant difference between the control groups at baseline, no difference at the later time point. Same as (a) above.     
c. Differences between the positive control and the SFN-treated groups. Here, we are interested in the reversal of UVB effect. Again, the interaction term will need to be significant for the reasons described above.      
    
First, we will compare weeks 2 and 15. Previous analyses showed that majority of cahnges happend at by this time. far less genes were differentially expressed between weeks 15 and 25.
```{r deseq2_week2_week15}
# Baseline vs. Week 15 only
dmeta.15 <- droplevels(dmeta[Week != "Week 25", ])

# Relevel: make all comparisons with the positive control (UVB)
dmeta.15$trt <- factor(dmeta.15$trt,
                       levels = c("UVB",
                                  "CON",
                                  "SFN"))

dtm.15 <- as.matrix(dt1[, dmeta.15$Sample,
                        with = FALSE])
rownames(dtm.15) <- dt1$Geneid

dds <- DESeqDataSetFromMatrix(countData = dtm.15, 
                              colData = dmeta.15,
                              ~ time + trt + time:trt)
# If all samples contain zeros, geometric means cannot be
# estimated. Change default 'type = "ratio"' to 'type = "poscounts"'.
# Type '?DESeq2::estimateSizeFactors' for more details.
dds <- estimateSizeFactors(object = dds,
                           type = "poscounts")

# Run DESeq----
dds <- DESeq(object = dds,
             # test = "LRT",
             # reduced = ~ time + trt,
             fitType = "local",
             sfType = "ratio",
             parallel = FALSE)

# NOTE (from DESeq help file, section Value):
# A DESeqDataSet object with results stored as metadata columns. 
# These results should accessed by calling the results function. 
# By default this will return the log2 fold changes and p-values
# for the last variable in the design formula. 
# See results for how to access results for other variables.
# In this case, the last term is the interaction term trt:time

resultsNames(dds)
# model.matrix(~ time + trt + time:trt, dmeta.15)
```

# Results: interactions terms
Tests if the effect of NOT treating with UVB vs. treating with UVB is different at Week 15 compared to Week 2:    
```{r deseq2_week2_week15_results_int_con_uvb}
res_int_con_uvb_week <- results(dds, 
                                name = "time15w.trtCON",
                                alpha = 0.1)
res_int_con_uvb_week <- res_int_con_uvb_week[order(res_int_con_uvb_week$padj,
                                                   decreasing = FALSE),]
print(res_int_con_uvb_week)
summary(res_int_con_uvb_week)

# How many adjusted p-values were less than 0.05?
sum(res_int_con_uvb_week$padj < 0.1, 
    na.rm = TRUE)

# MA plot
print(plotMA(res_int_con_uvb_week))
```

Tests if the effect of treating with UVB+SFN vs. treating with UVB is different at Week 15 compared to Week 2:    
```{r deseq2_week2_week15_results_int_sfn_uvb}
res_int_sfn_uvb_week <- results(dds, 
                                name = "time15w.trtSFN",
                                alpha = 0.1)
res_int_sfn_uvb_week <- res_int_sfn_uvb_week[order(res_int_sfn_uvb_week$padj,
                                                   decreasing = FALSE),]
print(res_int_sfn_uvb_week)
summary(res_int_sfn_uvb_week)

# How many adjusted p-values were less than 0.05?
sum(res_int_sfn_uvb_week$padj < 0.1, 
    na.rm = TRUE)

# MA plot
print(plotMA(res_int_sfn_uvb_week))

# NOTE: same as 
# res <- results(dds, 
#                   alpha = 0.05)
# res <- res[order(res$padj, decreasing = FALSE),]
# res
```

**NOTE**: By default, the **results(dds)*** prints the results for the last level of the last term, i.e. here it was for for the interaction term SFN vs. UVB at Week 15 vs. Week 2.
       
# Genes with both interactions being significant
```{r sign_int}
lgene.con <- unique(res_int_con_uvb_week@rownames[res_int_con_uvb_week$padj < 0.1])
lgene.sfn <- unique(res_int_sfn_uvb_week@rownames[res_int_sfn_uvb_week$padj < 0.1])
lgene <- lgene.con[lgene.con %in% lgene.sfn]
lgene <- lgene[!is.na(lgene)]
lgene
```

       
Plot of DESeq-normalizedcounts of genes with smallest adjusted p-value for the interaction term:     
```{r deseq2_week2_week15_top9_deseqnorm, fig.height = 6, fig.width = 8}
# Get the DESeq-normalize counts
dp1 <- list()
for (i in 1:length(lgene)) {
  out <- plotCounts(dds, 
                    gene = lgene[[i]],
                    intgroup = c("trt",
                                 "time"),
                    returnData = TRUE)
  dp1[[i]] <- data.table(Geneid = lgene[[i]],
                         Sample = rownames(out),
                         out)
}
dp1 <- rbindlist(dp1)
dp1$trt <- factor(dp1$trt,
                  levels = c("CON",
                             "UVB",
                             "SFN"))
dp1$time <- factor(dp1$time,
                   levels = c("02w",
                              "15w"),
                   labels = c("Week 2",
                              "Week 15"))
dp1$Geneid <- factor(dp1$Geneid,
                     levels = lgene)
dp1[, mu := mean(count,
                 na.rm = TRUE),
    by = c("Geneid",
           "trt",
           "time")]
dmu <- unique(dp1[, -c("Sample",
                       "count")])

p1 <- ggplot(dp1,
             aes(x = time,
                 y = count,
                 group = trt,
                 fill = trt)) +
  facet_wrap(~ Geneid,
             scale = "free_y") +
  geom_point(position = position_dodge(0.5),
             shape = 21,
             size = 5,
             color = "black") +
  geom_line(data = dmu,
            aes(x = time,
                y = mu,
                group = trt,
                colour = trt),
            position = position_dodge(0.5),
            alpha = 0.5,
            size = 2) +
  scale_x_discrete("") +
  scale_y_continuous("DESeq-Normalized Counts") +
  scale_fill_discrete("Treatment")
print(p1)
```
      
Compare to the plot of TPM-normalizedcounts of genes with smallest adjusted p-value for the interaction term:     
```{r deseq2_week2_week15_tpmnorm, fig.height = 6, fig.width = 8}
# Examine TPM values for the same genes
tmp <- tpm[Geneid %in% lgene, ]
tmp$Geneid <- factor(tmp$Geneid,
                     levels = lgene)
tmp <- melt.data.table(data = tmp,
                       id.vars = 1,
                       measure.vars = 3:ncol(tmp),
                       variable.name = "Sample",
                       value.name = "TPM")
tmp <- merge(dmeta,
             tmp,
             by = "Sample")

p1 <- ggplot(tmp,
             aes(x = Week,
                 y = TPM,
                 fill = Treatment,
                 group = Treatment)) +
  facet_wrap(~ Geneid,
             scales = "free_y") +
  geom_point(position = position_dodge(0.5),
             shape = 21,
             size = 5,
             color = "black")+
  scale_x_discrete("")
plot(p1)
```

# Session Information
```{r info,eval=TRUE}
sessionInfo()
```